Lab 1b. Species Distribution Modeling - Logistic Regression

Explore (continued)

librarian::shelf(
  DT, dplyr, dismo, GGally, here, readr, tidyr)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)

dir_data    <- here("ferret_data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

# pts_env_csv is a dataframe of all the pseudo absence points that we created in the last lab 1a

pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 404

Let’s look at a pairs plot (using GGally::ggpairs()) to show correlations between variables.

GGally::ggpairs(
  select(pts_env, -ID),
  aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero

## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).
## Warning: Removed 8 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 8 rows containing missing values
## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

## Warning: Removed 8 rows containing missing values (geom_point).

Section 2: Logistic Regression

2.1: Setup Data

Let’s setup a data frame with only the data we want to model by:

  • Dropping rows with any NAs. Later we’ll learn how to “impute” values with guesses so as to not throw away data.
  • Removing terms we don’t want to model. We can then use a simplified formula present∼. to predict present based on all other fields in the data frame (i.e. the X`s in y∼x1+x2+…xn).
# setup model data
d <- pts_env %>% 
  select(-ID) %>%  # remove terms we don't want to model
  tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 396

2.2: Linear Model

Let’s start as simply as possible with a linear model lm() on multiple predictors X to predict presence y using a simpler workflow.

# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
## 
## Call:
## lm(formula = present ~ ., data = d)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9343 -0.3526  0.1220  0.3077  0.8730 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -2.2044769  1.1851918  -1.860  0.06364 .  
## WC_alt             0.0007108  0.0001321   5.381 1.28e-07 ***
## WC_bio1            0.0742681  0.0593167   1.252  0.21130    
## WC_bio4           -0.0044213  0.0072958  -0.606  0.54486    
## WC_bio12          -0.0004700  0.0001607  -2.925  0.00364 ** 
## ER_minTempWarmest  0.0034410  0.0044083   0.781  0.43553    
## lon                0.0237295  0.0069365   3.421  0.00069 ***
## lat                0.0934411  0.0185982   5.024 7.73e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.416 on 388 degrees of freedom
## Multiple R-squared:  0.3215, Adjusted R-squared:  0.3093 
## F-statistic: 26.27 on 7 and 388 DF,  p-value: < 2.2e-16
y_predict <- predict(mdl, d, type="response")
y_true    <- d$present

range(y_predict)
## [1] -0.2498215  0.9342597
range(y_true)
## [1] 0 1

The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)

2.3 Generalized Linear Model

To solve this problem of constraining the response term to being between the two possible values, i.e. the probability p of being one or the other possible y values, we’ll apply the logistic transformation on the response term.

logit(pi)=loge(pi1−pi)

We can expand the expansion of the predicted term, i.e. the probability p of being either y, with all possible predictors X whereby each coeefficient b gets multiplied by the value of x:

loge(pi1−pi)=b0+b1x1,i+b2x2,i+⋯+bkxk,i

# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2930  -0.8451  -0.0788   0.7735   2.4495  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.8944167  7.6596321  -0.247 0.804657    
## WC_alt             0.0029223  0.0008076   3.618 0.000297 ***
## WC_bio1            0.0571172  0.3527970   0.162 0.871386    
## WC_bio4           -0.0504369  0.0416534  -1.211 0.225945    
## WC_bio12          -0.0054702  0.0014234  -3.843 0.000121 ***
## ER_minTempWarmest  0.0324760  0.0259127   1.253 0.210101    
## lon                0.1434753  0.0416323   3.446 0.000568 ***
## lat                0.3769769  0.1129234   3.338 0.000843 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 548.81  on 395  degrees of freedom
## Residual deviance: 392.84  on 388  degrees of freedom
## AIC: 408.84
## 
## Number of Fisher Scoring iterations: 5
y_predict <- predict(mdl, d, type="response")

range(y_predict)
## [1] 0.001944451 0.927848989

Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.

# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F, ylim="free")

## Generalized Additive Model

With a generalized additive model we can add “wiggle” to the relationship between predictor and response by introducing smooth s() terms.

librarian::shelf(mgcv)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
# my variables: "WC_alt", "WC_bio1", "WC_bio4", "WC_bio12", "ER_minTempWarmest"

# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio4) + s(WC_bio12) + s(ER_minTempWarmest) + s(lon) + s(lat), 
  family = binomial, data = d)
summary(mdl)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio4) + s(WC_bio12) + 
##     s(ER_minTempWarmest) + s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.7692     0.2117  -3.634 0.000279 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                        edf Ref.df Chi.sq  p-value    
## s(WC_alt)            1.000  1.000  0.385 0.535144    
## s(WC_bio1)           1.000  1.000  0.938 0.332842    
## s(WC_bio4)           5.649  6.790 23.770 0.001218 ** 
## s(WC_bio12)          1.000  1.000 29.740  < 2e-16 ***
## s(ER_minTempWarmest) 1.996  2.550  1.239 0.562900    
## s(lon)               1.000  1.000 14.732 0.000124 ***
## s(lat)               7.898  8.636 40.805  8.3e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.48   Deviance explained =   44%
## UBRE = -0.12051  Scale est. = 1         n = 396
# show term plots
plot(mdl, scale=0)

2.5 Maxent (Maximum Entropy)

Maxent is probably the most commonly used species distribution model (Elith 2011) since it performs well with few input data points, only requires presence points (and samples background for comparison) and is easy to use with a Java graphical user interface (GUI).

# load extra packages
librarian::shelf(
  maptools, sf)
## 
##   The 'cran_repo' argument in shelf() was not set, so it will use
##   cran_repo = 'https://cran.r-project.org' by default.
## 
##   To avoid this message, set the 'cran_repo' argument to a CRAN
##   mirror URL (see https://cran.r-project.org/mirrors.html) or set
##   'quiet = TRUE'.
mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds")

# show version of maxent
if (!interactive())
  maxent()
## Loading required namespace: rJava
## This is MaxEnt version 3.4.3

This is MaxEnt version 3.4.3

# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)

# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>% 
  sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class

# fit a maximum entropy model
if (!file.exists(mdl_maxent_rds)){
  mdl <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl, mdl_maxent_rds)
}
mdl <- read_rds(mdl_maxent_rds)

# plot variable contributions per predictor
plot(mdl)

# plot term plots
response(mdl)

# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')

plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')

Notice how the plot() function produces different outputs depending on the class of the input object. You can view help for each of these with R Console commands: ?plot.lm, ?plot.gam and plot,DistModel,numeric-method.

Next time we’ll split the data into test and train data and evaluate model performance while learning about tree-based methods.